The purpose of this workflow is to look for correlations between DNA methylation and gene expression within-strain, and across-strain.

This workflow uses the output from the RRBS workflow.

## here() starts at /Users/atyler/Documents/Projects/Epigenetics/Epigenetics_Manuscript
all.code.dir <- list.files(here("code"), full.names = TRUE)
for(i in 1:length(all.code.dir)){
    all.fun <- list.files(all.code.dir[i], full.names = TRUE)
    for(j in 1:length(all.fun)){source(all.fun[j])}
}
rrbs.file <- here("Data", "RRBS", "All.Methylation.RDS")
all.var <- ls()
rrbs.loaded <- as.logical(length(which(all.var == "all.rrbs")))

if(!rrbs.loaded){
    all.rrbs <- readRDS(rrbs.file)
}
transcript.info.file <- here("Data", "RNASeq", "RNASeq_gene_info.RData")
transcript.info <- readRDS(transcript.info.file)

ind.name <- names(all.rrbs)
ind.label <- sapply(strsplit(ind.name, ""), function(x) paste(tail(x, 2), collapse = ""))
treat.label <- substr(ind.label, 1, 1)
ind <- substr(ind.label, 2,2)
strain <- mapply(function(x,y) gsub(x, "", y), ind.label, ind.name)
u_strain <- unique(strain)
id.table <- cbind(strain, treat.label, ind)
strain.expr <- readRDS(here("Data", "RNASeq", "Strain_Mean_C_Expression.RData"))
scaled.expr <- lapply(strain.expr, scale)
#scaled.expr <- lapply(strain.expr, function(x) x - x[2]) #center on B6

col.table <- as.matrix(read.table(here("Data", "support_files", "strain.color.table.txt"), 
sep = "\t", comment.char = "%", stringsAsFactors = FALSE))

u_genes <- unique(transcript.info[,"external_gene_name"])
strains <- col.table[,6] #the column that matches the names in the RRBS data

test.num <- length(u_genes)
survey.buffer = 5000 #The number of bp to look up and downstream of specified regions
plot.buffer = 2000 #The number of bp around a point to plot
mean.buffer = 1000 # The number of bp around a point for calculating mean methylation

Methylome

The following code creates a single methylome across all control animals. We created a matrix for each chromosome containing each unique methylated position, and its methylation percent across all individuals. We binned the percent methylation for each position into bins for unmethylated (0%), hemimethylated (50%), and methylated (100%). The goal is to look for general strain patterns in the methylome. Does the methylome have strain-based patterns?

get_methyl_sites <- function(chr, sites, methyl_list){
  chr.locale <- which(methyl_list[,1] == chr)
  chr.table <- methyl_list[chr.locale,]
  site.locale <- match(sites, chr.table[,2])
  return(chr.table[site.locale,4])
} 

methylome_file <- here("Results", "RRBS", "Methylome.RDS")
if(!file.exists(methylome_file) || delete.previous){
  c.locale <- which(id.table[,"treat.label"] == "C")
  all_sites <- vector(mode = "list", length = 19)
  for(chr in 1:19){
    if(is.interactive){print(chr)}
    u_sites <- unique(unlist(lapply(all.rrbs, function(x) x[which(x[,1] == chr),2])))
    strain_sites <- sapply(all.rrbs, function(x) get_methyl_sites(chr, u_sites, x))
    rownames(strain_sites) <- u_sites
    binned_methyl <- apply(strain_sites[,c.locale], 2, function(x) bin.vector(x, c(0, 50, 100)))
    all_sites[[chr]] <- binned_methyl
  }
saveRDS(all_sites, methylome_file)
}else{
all_sites <- readRDS(methylome_file)
}

total.sites <- sum(sapply(all_sites, nrow))

There are 5311670 unique sites in this methylome.

Strain-Specific CpG sites

We investigated whether the CpG sites were differentially detected across strains. We defined a strain-specific site as any site for which methylation values were present for all individuals of some strains, and absent for all individuals of other strains.

#This function finds strain-specific CpG sites. These are
#sites that are present in all individuals of a strain they
#are present in, and absent in all individuals of a strain
#they are absent in.
methyl_by_strain <- function(strain_methyl_mat, strain.idx.list){
  #chunk into strains
  if(is.interactive){cat("Checking for strain-specific methylation sites...\n")}
  chunk.methylome <- lapply(strain.idx.list, function(x) strain_methyl_mat[,x])
  #look for sites that have all or none for all strains
  all_or_none <- matrix(NA, nrow = nrow(strain_methyl_mat), ncol = length(chunk.methylome))
  for(i in 1:length(chunk.methylome)){
    if(is.interactive){cat("\t", i, "\n")}
    all_or_none[,i] <- apply(chunk.methylome[[i]], 1, function(x) all(!is.na(x)) || all(is.na(x)))
  }
  rows_to_check <- which(rowSums(all_or_none) == ncol(all_or_none))
  test.methyl <- strain_methyl_mat[rows_to_check,]
  has.na <- which(is.na(rowSums(test.methyl)))
  strain.based.presence <- test.methyl[has.na,]
  #quartz();pheatmap(strain.based.presence[1:1000,], cluster_cols = FALSE, cluster_rows = FALSE)
  return(strain.based.presence)
}

char_num <- nchar(colnames(all_sites[[1]]))
methylome_strains <- substr(colnames(all_sites[[1]]), 1, (char_num-2))
strain.idx.list <- lapply(col.table[,6], function(x) which(methylome_strains == x))
names(strain.idx.list) <- col.table[,6]
methylome.strain.col <- col.table[match(methylome_strains, col.table[,6]), 3]

#get all strain-specific CpG sites
strain.specific.file <- here("Results", "RRBS", "strain_specific_sites.RDS")
if(!file.exists(strain.specific.file) || delete.previous){
  strain.spec.sites <- lapply(all_sites, function(x) methyl_by_strain(x, strain.idx.list))
  saveRDS(strain.spec.sites, strain.specific.file)
}else{
strain.spec.sites <- readRDS(strain.specific.file)
}

The following plot shows the proportion of sites per chromosome that were differentially detected across the strains. For each chromosome, between 15 and 20% of methylated sites were differentially detected across strains.

num_strain_spec <- sapply(strain.spec.sites, nrow)
num_all <- sapply(all_sites, nrow)
prop_spec <- num_strain_spec/num_all
names(prop_spec) <- 1:19 #one element per chromosome
if(is.interactive){quartz()}
barplot(prop_spec, ylim = c(0, 0.2))

The following plot shows the fold enrichment of strain-specific CpG sites compared with annotated genomic features. They are enriched in CpG islands, which makes sense, as well as promoters, and features in the region of the TSS.

#we can look for enrichment of chromatin states around strain-specific
#CpG sites.
sites_to_bed <- function(strain.methyl.mat, chr, bp.buffer = 1){
  site.pos <- as.numeric(rownames(strain.methyl.mat))
  start.pos <- site.pos - bp.buffer
  end.pos <- site.pos + bp.buffer
  chr.text <- rep(paste0("chr", chr), length(site.pos))
  bed.table <- cbind(chr.text, start.pos, end.pos)
  return(bed.table)
}

save_bed <- function(bed.table, filename){
  gz_file <- paste0(filename, ".gz")
  if(!file.exists(gz_file)){
    write.table(bed.table, filename, quote = FALSE, sep = "\t", 
    col.names = FALSE, row.names = FALSE)
    system(paste("gzip", filename))
  }
  return(gz_file)
}

strain_bed <- Reduce("rbind", lapply(1:length(strain.spec.sites), 
  function(x) sites_to_bed(strain.spec.sites[[x]], x, 1)))
strain_gz_file <- save_bed(strain_bed, here("Data", "RRBS", "Strain_Specific_CpG.bed"))

background.files <- list.files(here("Data", "mm10"), full.names = TRUE)

strain.spec.enrich.file <- here("Data", "RRBS", "strain-specific_site_enrichment.RDS")

if(!file.exists(strain.spec.enrich.file)){
  strain_spec_CpG_enrich <- rep(NA, length(background.files))
  names(strain_spec_CpG_enrich) <- gsub(".bed.gz", "", basename(background.files))

  for(i in 1:length(background.files)){
          if(is.interactive){cat(basename(background.files[i]), "\n")}
          strain_spec_CpG_enrich[i] <- enrichment_bed(
          state.bed = strain_gz_file, 
          annotation.bed = background.files[i], 
          chrom.size.file = here("Data", "support_files", "mm10.txt"))
  }
  saveRDS(strain_spec_CpG_enrich, strain.spec.enrich.file)
}else{
  strain_spec_CpG_enrich <- readRDS(strain.spec.enrich.file)
}

if(is.interactive){quartz()}
par(mar = c(4, 12, 4, 4))
barplot(sort(log10(strain_spec_CpG_enrich)), las = 2, xlab = "lo10(Fold Enrichment)", 
horiz = TRUE, main = "log10 Fold Enrichment for Strain-Specific CpG Sites")

The following plots show PC plots for each chromosome to indicate how the strain-specific sites were distributed.

For the most part, CAST and PWK separate from the other strains, as expected. WSB also has strain-specific methylation sites, but not quite to the extent that CAST and PWK do. DBA and 129 have some strain-specific sites on chromosome 14.

Are these strain-specific sites real, or is this an artifact of the different genome builds and how we localize the sequence reads?

site.distribution <- Reduce("rbind", strain.spec.sites)
site.distribution[which(!is.na(site.distribution))] <- 1
site.distribution[which(is.na(site.distribution))] <- 0
site.decomp.file <- here("Results", "RRBS", "CpG_site_distribution_decomposition.RDS")
if(!file.exists(site.decomp.file) || delete.previous){
  site.dist.decomp <- plot.decomp(t(site.distribution), plot.results = FALSE)
  saveRDS(site.dist.decomp, site.decomp.file)
}else{
  site.dist.decomp <- readRDS(site.decomp.file)
}

rgb.col <- apply(col2rgb(methylome.strain.col), 2, function(x) rgb(x[1]/256, x[2]/256, x[3]/256, 
  alpha = 0.5))

if(is.interactive){quartz(width = 10, height = 5)}
plot(site.dist.decomp$u[,1], site.dist.decomp$u[,2], pch = 16,
cex = 1, col = rgb.col, xlab = "PC1", ylab = "PC2",
main = "Decomposition of Strain-Specific CpG Site Distribution")

Strain-Specific Methylation

There may be strain-specific variation in methylation levels at CpG sites that are present in all strains. We looked for strain-specificity in percent DNA methylation at sites for which all strains had measured methylation. For these PC plots, we binned methylation into 0, 50, and 100%.

#subset to sites that are present across all strains
full_methyl <- lapply(all_sites, function(x) x[which(!is.na(rowSums(x))),])
#count the number of methylation levels across all sites
methyl_levels <- lapply(full_methyl, function(x) apply(x, 1, function(y) length(unique(y))))

#plot.decomp(t(full_methyl[[1]]), label.points = TRUE)

non_varying_sites <- lapply(1:length(full_methyl), 
  function(x) full_methyl[[x]][which(methyl_levels[[x]] == 1),])
non_varying_bed <- Reduce("rbind", lapply(1:length(non_varying_sites), 
  function(x) sites_to_bed(non_varying_sites[[x]], x, 1)))
non_varying_gz <- save_bed(non_varying_bed, here("Data", "RRBS", "non_varying_sites.bed"))

bimodal_sites <- lapply(1:length(full_methyl), 
  function(x) full_methyl[[x]][which(methyl_levels[[x]] == 2),])
bimodal_bed <- Reduce("rbind", lapply(1:length(bimodal_sites), 
  function(x) sites_to_bed(bimodal_sites[[x]], x, 1)))
bimodal_gz <- save_bed(bimodal_bed, here("Data", "RRBS", "bimodal_sites.bed"))

trimodal_sites <- lapply(1:length(full_methyl), 
  function(x) full_methyl[[x]][which(methyl_levels[[x]] == 3),])
trimodal_bed <- Reduce("rbind", lapply(1:length(trimodal_sites), 
  function(x) sites_to_bed(trimodal_sites[[x]], x, 1)))
trimodal_gz <- save_bed(trimodal_bed, here("Data", "RRBS", "trimodal_sites.bed"))

unmethylated_sites <- lapply(1:length(full_methyl), 
  function(x) full_methyl[[x]][which(apply(full_methyl[[x]], 1, function(y) all(y == 0))),])
unmethylated_bed <- Reduce("rbind", lapply(1:length(unmethylated_sites), 
  function(x) sites_to_bed(unmethylated_sites[[x]], x, 1)))
unmethylated_gz <- save_bed(unmethylated_bed, here("Data", "RRBS", "unmethylated_sites.bed"))

fullymethylated_sites <- lapply(1:length(full_methyl), 
  function(x) full_methyl[[x]][which(apply(full_methyl[[x]], 1, function(y) all(y == 100))),])
fullymethylated_bed <- Reduce("rbind", lapply(1:length(fullymethylated_sites), 
  function(x) sites_to_bed(fullymethylated_sites[[x]], x, 1)))
fullymethylated_gz <- save_bed(fullymethylated_bed, here("Data", "RRBS", "fullymethylated_sites.bed"))

The following box plot shows the distribution of proportions of sites on each chromosome that have no variation across strains, have a bimodal distribution (methylated/unmethylated), or a trimodal distribution (methylated/hemimethylated/unmethylated).

site_counts <- sapply(1:length(non_varying_sites), 
  function(x) c(nrow(non_varying_sites[[x]]), nrow(bimodal_sites[[x]]), 
  nrow(trimodal_sites[[x]]), nrow(fullymethylated_sites[[x]]), nrow(unmethylated_sites[[x]])))
colnames(site_counts) <- 1:19
rownames(site_counts) <- c("Non-Varying", "Bimodal", "Trimodal", "Fully_methylated", "Unmethylated")

site_prop <- apply(site_counts, 2, function(x) x/sum(x))
type.cols <- brewer.pal(5, "Set1")

boxplot(t(site_prop), main = "Proportion of CpG sites", ylab = "Proportion", col = type.cols)

The following heat map shows the log10(Fold Enrichment) of each type of CpG site (non-varying, bimodal, or trimodal) relative to annotated positions in the genome.

This shows us that non-varying CpG sites are enriched in promoters and TSS regions, but depleted in enhancers. This pattern was dominated by unmethylated sites. Fully methylated sites were depleted at the TES, and in promoters, but slightly enriched in enhancers. Bimodal and trimodal sites were relatively enriched in enhancers.

CpG.enrich.mat.file <- here("Results", "RRBS", "CpG.types.enrichment.RDS")

if(!file.exists(CpG.enrich.mat.file) || delete.previous){
  test.files <- list.files(path = here("Data", "RRBS"), pattern = ".bed.gz", full.names = TRUE)
  CpG.enrich.mat <- matrix(NA, nrow = length(test.files), ncol = length(background.files))
  rownames(CpG.enrich.mat) <- gsub(".bed.gz", "", basename(test.files))
  colnames(CpG.enrich.mat) <- gsub(".bed.gz", "", basename(background.files))
  for(i in 1:length(background.files)){
    cat(basename(background.files[i]), "\n")
    for(j in 1:length(test.files)){
     cat("\t", basename(test.files[j]), "\n")
      CpG.enrich.mat[j,i] <- enrichment_bed(
        state.bed = test.files[j], 
        annotation.bed = background.files[i], 
        chrom.size.file = here("Data", "support_files", "mm10.txt"))
    }
  }
  saveRDS(CpG.enrich.mat, CpG.enrich.mat.file)
}else{
  CpG.enrich.mat <- readRDS(CpG.enrich.mat.file)
}

row.order <- hclust(dist(log10(CpG.enrich.mat)))$order
col.order <- hclust(dist(t(log10(CpG.enrich.mat))))$order

par(mar = c(10,10,2,2))
imageWithText(round(log10(t(CpG.enrich.mat[row.order,col.order])),2), 
  split.at.vals = TRUE, col.scale = c("blue", "brown"), grad.dir = "ends")
## Loading required package: grid

Methylation by gene

Get methylation percentages for all genes and all strains given the above bp buffers.

strain.gene.methylation <- vector(mode = "list", length = length(strains))
names(strain.gene.methylation) <- strains

for(st in 1:length(strains)){
  gene.methyl.file <- here("Data", "RRBS", paste0("All.Gene.Methylation.", strains[st], ".RDS"))

  if(!file.exists(gene.methyl.file) || delete.previous){
    cat("Getting methylation for", strains[st], "\n")
    gene.methyl <- lapply_pb(u_genes[1:test.num], 
    function(x) get.methyl.by.gene(x, gene.info.table = transcript.info, 
    methyl.data = all.rrbs, methyl.id.table = id.table, 
    upstream.buffer = survey.buffer, downstream.buffer = survey.buffer, 
    strains = strains[st], treatment = "C", strain.means = FALSE))
    names(gene.methyl)  <- u_genes[1:test.num]
    saveRDS(gene.methyl, gene.methyl.file)
  }else{
    gene.methyl <- readRDS(gene.methyl.file)
  }
  strain.gene.methylation[[st]] <- gene.methyl
}

Calculate average RRBS for each gene across replicates.

avg_methyl <- function(methyl_mat, min_rep = 2){
  n_rep <- apply(methyl_mat, 1, function(x) length(which(!is.na(x))))
  to_avg <- which(n_rep >= min_rep)
  if(length(to_avg) == 0){
    return(NA)
  }else{
    avg_methyl <- rowMeans(methyl_mat[to_avg,,drop=FALSE], na.rm = TRUE)
    return(avg_methyl)
  }
}

strain.avg.file <- here("Results", "RRBS", "Strain.Avg.RRBS.RDS")
if(!file.exists(strain.avg.file) || delete.previous){
  strain_avg_rrbs <- vector(mode = "list", length = length(strains))
  names(strain_avg_rrbs) <- strains
  for(st in 1:length(strains)){
    if(is.interactive){
      cat(strains[st], "\n")
      strain_avg_rrbs[[st]] <- lapply_pb(strain.gene.methylation[[st]], 
      function(x) if(length(x) > 3){avg_methyl(x, min_rep = 2)}else{NA})
      names(strain_avg_rrbs[[st]]) <- names(strain.gene.methylation[[st]])
    }else{
      strain_avg_rrbs[[st]] <- lapply(strain.gene.methylation[[st]], 
      function(x) if(length(x) > 3){avg_methyl(x, min_rep = 2)}else{NA})
      names(strain_avg_rrbs[[st]]) <- names(strain.gene.methylation[[st]])
    }
  }
  saveRDS(strain_avg_rrbs, strain.avg.file)
}else{
  strain_avg_rrbs <- readRDS(strain.avg.file)
}

Combine methylation data for each gene across strains.

methyl.mat.file <- here("Results", "RRBS", "Aligned.Methyl.Mats.RDS")
if(!file.exists(methyl.mat.file) || delete.previous){
  methyl.mats <- vector(mode = "list", length = length(strain_avg_rrbs[[1]]))
  names(methyl.mats) <- names(strain_avg_rrbs[[1]])
  for(i in 1:length(methyl.mats)){
    if(is.interactive){report.progress(i, length(methyl.mats))}
    all.strain.rrbs <- lapply(strain_avg_rrbs, function(x) x[[i]])
    all.pos <- sort(unique(unlist(lapply(all.strain.rrbs, function(x) as.numeric(names(x))))))
    strain.mat <- matrix(NA, ncol = length(all.pos), nrow = length(all.strain.rrbs))
    rownames(strain.mat) <- names(all.strain.rrbs)
    colnames(strain.mat) <- all.pos
    for(j in 1:length(all.strain.rrbs)){
      strain.mat[j,match(names(all.strain.rrbs[[j]]), all.pos)] <- all.strain.rrbs[[j]]
    }
    methyl.mats[[i]] <- strain.mat
    #pheatmap(strain.mat, cluster_rows = FALSE, cluster_cols = FALSE)
  }
saveRDS(methyl.mats, methyl.mat.file)
}else{
  methyl.mats <- readRDS(methyl.mat.file)
}

#quartz();pheatmap(methyl.mats[[1]], cluster_rows = FALSE, cluster_cols = FALSE)

Normalize gene coordinates to run from 0 to 1 from the TSS to the TES.

norm.rrbs.file <- here("Results", "RRBS", "RRBS.Normalized.Position.RDS")

if(!file.exists(norm.rrbs.file) || delete.previous){
  #first center all the coordinates
    norm_rrbs_coord <- lapply_pb(1:length(methyl.mats), 
        function(x) if(length(methyl.mats[[x]]) > 1){center.on.feature(gene.name = names(methyl.mats)[x], 
        gene.info = transcript.info, vals = methyl.mats[[x]][1,], 
        feature = "full")}else{NA})

  #put the normalized coordinates onto the methyl.mats
    norm_rrbs <- methyl.mats
    names(norm_rrbs) <- names(methyl.mats)
    for(i in 1:length(norm_rrbs)){
      colnames(norm_rrbs[[i]]) <- names(norm_rrbs_coord[[i]])
    }
    saveRDS(norm_rrbs, norm.rrbs.file)
}else{
    norm_rrbs <- readRDS(norm.rrbs.file)
}
#quartz();pheatmap(norm_rrbs[[1]], cluster_rows = FALSE, cluster_cols = FALSE)

Methylation density by gene position

We looked at where the densest DNA methylation occurred relative to the gene body. Methylation is densest at the TSS, and very sparse within gene bodies.

Density of methylated positions. For each gene, we looked at the distance of each DNA methylation position to the next position. Around 80% of DNA methylation positions were a single base pair from the next position in all strains.

delete.previous = TRUE

methyl_consec <- lapply(methyl.mats, function(x) if(length(x) > 1){consec.pairs(as.numeric(colnames(x)))}else{NA})
methyl_dist <- lapply(methyl_consec, function(x) if(length(x) >  1){x[,2] - x[,1]}else{NA})
for(i in 1:length(methyl_dist)){
  if(!is.null(colnames(norm_rrbs[[i]]))){
    consec.coord <- consec.pairs(as.numeric(colnames(norm_rrbs[[i]])))
    mean.coords <- rowMeans(consec.coord)
    names(methyl_dist[[i]]) <- mean.coords
  }
}
if(is.interactive){quartz()}
methyl_dens_file <- here("Results", "RRBS", "Methylation_Density.RDS")
if(!file.exists(methyl_dens_file) || delete.previous){
  avg_methyl_dist <- plot.centered.vals(val.list = methyl_dist, 
    seq.by = seq.by, min.representation = 10, ylim = c(0, 3500),
    plot.label = paste("Average Distance to Next Methylation Position"),
    ylab = "Distance to Next Methylation Position", return.means = FALSE)
  abline(v = c(0,1))
  saveRDS(avg_methyl_dist, methyl_dens_file)
}else{
  avg_methyl_dist <- readRDS(methyl_dens_file)
  plot.centered.mat(avg_methyl_dist, ylim = c(0, 3500),
    plot.label = paste("Average Distance to Next Methylation Position"),
    ylab = "Distance to Next Methylation Position")
      abline(v = c(0,1))
}

delete.previous = FALSE

DNA methylation percentage

The following plots show the percent methylation across the gene body. Methylation sites outside gene bodies have 50% methylation. Sites at the TSS have very low percentage methylation, and sites within the gene body have very high methylation.

delete.previous = TRUE

strain.methyl.file <- here("Results", "RRBS", "Strain.RRBS.Percent.RDS")
if(!file.exists(strain.methyl.file) || delete.previous){
  strain_rrbs_mat <- vector(mode = "list", length = length(strains))
  names(strain_rrbs_mat) <- strains
  for(st in 1:length(strains)){
    cat("###", strains[st], "\n")
    norm_strain_avg <- lapply(norm_rrbs, function(x) if(length(x) > 1){x[st,]}else{NA}) 
    strain_rrbs_mat[[st]] <- plot.centered.vals(val.list = norm_strain_avg, 
    return.means = FALSE, seq.by = seq.by, ylim = c(0,100), 
    plot.label = paste("Percent Methylation Across the Gene Body in", strains[st]))
    abline(v = c(0,1), lty = 2, col = "gray", lwd = 2)
    cat("\n\n")
  }
  saveRDS(strain_rrbs_mat, strain.methyl.file)
}else{
  strain_rrbs_mat <- readRDS(strain.methyl.file)
}

129

AJ

## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in plot.centered.vals(val.list = norm_strain_avg, return.means =
## FALSE, : NAs introduced by coercion

## Warning in plot.centered.vals(val.list = norm_strain_avg, return.means =
## FALSE, : NAs introduced by coercion

B6

## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## Warning in FUN(X[[i]], ...): NAs introduced by coercion

CA

DB

## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## Warning in FUN(X[[i]], ...): NAs introduced by coercion

ND

## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in plot.centered.vals(val.list = norm_strain_avg, return.means =
## FALSE, : NAs introduced by coercion

## Warning in plot.centered.vals(val.list = norm_strain_avg, return.means =
## FALSE, : NAs introduced by coercion

NZ

## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in plot.centered.vals(val.list = norm_strain_avg, return.means =
## FALSE, : NAs introduced by coercion

## Warning in plot.centered.vals(val.list = norm_strain_avg, return.means =
## FALSE, : NAs introduced by coercion

## Warning in plot.centered.vals(val.list = norm_strain_avg, return.means =
## FALSE, : NAs introduced by coercion

PK

## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in plot.centered.vals(val.list = norm_strain_avg, return.means =
## FALSE, : NAs introduced by coercion

## Warning in plot.centered.vals(val.list = norm_strain_avg, return.means =
## FALSE, : NAs introduced by coercion

WS

## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in plot.centered.vals(val.list = norm_strain_avg, return.means =
## FALSE, : NAs introduced by coercion

## Warning in plot.centered.vals(val.list = norm_strain_avg, return.means =
## FALSE, : NAs introduced by coercion

delete.previous = FALSE

The following plot shows the median percent methylation by position.

all_rrbs_met <- lapply(norm_rrbs, 
  function(x) if(length(x) > 1){colMeans(apply(x, 2, as.numeric), na.rm = TRUE)}else{NA})

all_rrbs_med <- t(sapply(strain_rrbs_mat, function(x) apply(x, 2, 
  function(y) median(y, na.rm = TRUE))))
rel.pos <- as.numeric(colnames(all_rrbs_med))

if(is.interactive){quartz()}
plot.new()
plot.window(xlim = c(min(rel.pos), max(rel.pos)), ylim = c(0, 100))
for(i in 1:nrow(all_rrbs_med)){
  points(rel.pos, all_rrbs_med[i,], col = col.table[i,3], lwd = 2, type = "l")
}
axis(1);axis(2)
abline(v = c(0,1))
mtext(side = 1, text = "Relative Genomic Position", line = 2.5)
mtext(side = 2, text = "Median Percent DNA Methylation", line = 2.5)

The following plot shows data for all strains together.

all.strain.mat <- Reduce("rbind", strain_rrbs_mat)
plot.centered.mat(all.strain.mat, ylim = c(0,100), 
plot.label = "Percent Methylation Across the Gene Body", 
show.var = "se", plot.fun = "median", min.upstream = -0.5,
max.downstream = 1.5)
abline(v = c(0,1), lty = 2, col = "gray", lwd = 2)

Non-varying, Bimodal, and Trimodal by gene

I wanted to dig a bit deeper into these types of sites to see if they are enriched anywhere in the gene body, or in specific sets of genes.

type_by_norm_pos <- function(norm.rrbs.mat, n.states = 1, min.strains = 3){
  binned.rrbs <- apply(norm.rrbs.mat, 2, function(x) bin.vector(x, c(0, 50, 100)))
  num.strains <- apply(binned.rrbs, 2, function(x) length(which(!is.na(x))))
  dont.pass <- which(num.strains < min.strains)
  num.sites <- apply(binned.rrbs, 2, function(x) length(unique(x[which(!is.na(x))])))
  type.present <- rep(0, ncol(norm.rrbs.mat))
  type.present[which(num.sites == n.states)] <- 1
  type.present[dont.pass] <- NA 
  names(type.present) <- colnames(norm.rrbs.mat)
  return(type.present)
}

non.varying.rrbs.file <- here("Results", "RRBS", "site.type.nonvarying.RDS")
if(!file.exists(non.varying.rrbs.file) || delete.previous){
  non_varying_rrbs <- lapply_pb(norm_rrbs, 
    function(x) if(length(x) > 1){type_by_norm_pos(x, 1, 3)}else{NA})
  saveRDS(non_varying_rrbs, non.varying.rrbs.file)
}else{
  non_varying_rrbs <- readRDS(non.varying.rrbs.file)
}


bimodal.rrbs.file <- here("Results", "RRBS", "site.type.bimodal.RDS")
if(!file.exists(bimodal.rrbs.file) || delete.previous){
  bimodal_rrbs <- lapply_pb(norm_rrbs, 
    function(x) if(length(x) > 1){type_by_norm_pos(x, 2, 3)}else{NA})
  saveRDS(bimodal_rrbs, bimodal.rrbs.file)
}else{
  bimodal_rrbs <- readRDS(bimodal.rrbs.file)
}

trimodal.rrbs.file <- here("Results", "RRBS", "site.type.trimodal.RDS")
if(!file.exists(trimodal.rrbs.file) || delete.previous){
  trimodal_rrbs <- lapply_pb(norm_rrbs, 
    function(x) if(length(x) > 1){type_by_norm_pos(x, 3, 3)}else{NA})
  saveRDS(trimodal_rrbs, trimodal.rrbs.file)
}else{
  trimodal_rrbs <- readRDS(trimodal.rrbs.file)
}

The following plots show the frequencies of each type of CpG site as they occur along the gene body.

The majority of sites (60%) are non-varying everywhere. This increases to about 80% at the TSS. The dashed horizongal line markd 50%.

delete.previous = TRUE

non_varying_mat <- plot.centered.vals(non_varying_rrbs, seq.by = seq.by, ylim = c(0,1), 
  return.means = FALSE, plot.label = "Non-Varying")
#plot.centered.mat(non_varying_mat, ylim = c(0, 1), plot.label = "Non-Varying")
abline(v = c(0,1))
abline(h = 0.5, col = "darkgray", lty = 2)

tss_non_var <- lapply(non_varying_rrbs, function(x) x[get.nearest.pt(as.numeric(names(x)), 0)])
non_var_genes <- which(sapply(tss_non_var, function(x) length(x) > 0 && x == 1))
non_var_enrich <- gost(names(non_var_genes), organism = "mmusculus")
par(mfrow = c(1,2))
plot.enrichment.wordcloud(non_var_enrich)

delete.previous = FALSE

Around 35% of sites are bimodal. Around 20% of TSS sites are bimodal. These genes are enriched for response to stimuli.

delete.previous = TRUE

par(oldPar)
bimodal_mat <- plot.centered.vals(bimodal_rrbs, seq.by = seq.by, ylim = c(0,0.6), 
  return.means = FALSE, plot.label = "Bimodal")
#plot.centered.mat(bimodal_mat, ylim = c(0, 0.6), plot.label = "Bimodal")
abline(v = c(0,1))
abline(h = 0, col = "darkgray", lty = 2)

tss_bimodal <- lapply(bimodal_rrbs, function(x) x[get.nearest.pt(as.numeric(names(x)), 0)])
bimodal_genes <- which(sapply(tss_bimodal, function(x) length(x) > 0 && x == 1))
bimodal_enrich <- gost(names(bimodal_genes), organism = "mmusculus")
par(mfrow = c(1,2))
plot.enrichment.wordcloud(bimodal_enrich)

delete.previous = FALSE

Very few sites, roughly 4% are trimodal in any given position. Just under 3% are trimodal at the TSS. These genes are not enriched for anything.

delete.previous = TRUE

par(oldPar)
trimodal_mat <- plot.centered.vals(trimodal_rrbs, seq.by = seq.by, ylim = c(0,0.08), 
  return.means = FALSE, plot.label = "Trimodal")
#plot.centered.mat(trimodal_mat, ylim = c(0, 0.08), plot.label = "Trimodal")
abline(v = c(0,1))
abline(h = 0, col = "darkgray", lty = 2)

tss_trimodal <- lapply(trimodal_rrbs, function(x) x[get.nearest.pt(as.numeric(names(x)), 0)])
trimodal_genes <- which(sapply(tss_trimodal, function(x) length(x) > 0 && x == 1))
trimodal_enrich <- gost(names(trimodal_genes), organism = "mmusculus")
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
par(mfrow = c(1,2))
plot.enrichment.wordcloud(trimodal_enrich)
## NULL
delete.previous = FALSE

Within-Strain Effect of DNA Methylation on Expression

The following plot shows the effect of percent DNA methylation on gene expression across all strains, normalized by position.

There is a very weak negative effect of DNA methylation on gene expression near the TSS. Other than that, there is no overall effect of DNA methylation.

rrbs.expr.effect <- vector(mode = "list", length = length(strains))
names(rrbs.expr.effect) <- strains
strain.expr.order <- order.strains(strains,names(strain.expr[[1]]), col.table)

#align gene methylation percentages with expression in this strain.
common.genes <- Reduce("intersect", lapply(strain_rrbs_mat, rownames))
gene.id <- transcript.info[match(common.genes, 
  transcript.info[,"external_gene_name"]),"ensembl_gene_id"]
gene.idx <- match(gene.id, names(strain.expr))
strain.expr.mat <- t(sapply(gene.idx, function(x) if(length(strain.expr[[x]]) > 1){strain.expr[[x]]}else{rep(NA, 9)}))
rownames(strain.expr.mat) <- common.genes
colnames(strain.expr.mat) <- rownames(strain.expr[[1]])

for(st in 1:length(strains)){
  strain_mat <- strain_rrbs_mat[[st]][match(common.genes, rownames(strain_rrbs_mat[[st]])),]
  rrbs.expr.effect[[st]] <- apply(strain_mat, 2, 
    function(x) plot.with.model(x, strain.expr.mat[,strain.expr.order[st]], 
    confidence = 0.99, plot.results = FALSE))
}

rel.pos <- as.numeric(colnames(rrbs.expr.effect[[1]]))
all.fit <- rowMeans(sapply(rrbs.expr.effect, function(x) x[3,]))
all.lwr <- rowMeans(sapply(rrbs.expr.effect, function(x) x[4,]))
all.upr <- rowMeans(sapply(rrbs.expr.effect, function(x) x[5,]))

upstream.limit <- -0.5
downstream.limit <- 1.5
min.fit = -0.02
max.fit = 0.02

if(is.interactive){quartz(width = 7, height = 5)}
plot.new()
plot.window(xlim = c(upstream.limit, downstream.limit), ylim = c(min.fit, max.fit)) 
plot.poly.xy(rel.pos, all.lwr, rel.pos, all.upr, col = "gray80", border = "gray80")
points(rel.pos, all.fit, type = "l", col = "gray40", lwd = 2)
axis(1)
axis(2)
mtext(text = "Relative Gene Position", side = 1, line = 2.5)
mtext(text = "Effect Size", side = 2, line = 2.5)
abline(v = c(0,1), lwd = 2, col = "gray60", lty = 2)
abline(h = 0, col = "gray40", lty = 2)

#pheatmap(rrbs.expr.cor, cluster_cols = FALSE, cluster_rows = FALSE)

within.strain.plotting.data <- cbind(rel.pos, all.lwr, all.fit, all.upr)
colnames(within.strain.plotting.data) <- c("position", "lwr", "fit", "upr")
saveRDS(within.strain.plotting.data, here("Results", "RRBS", "Plotting_Within_Strain.RDS"))

Across-strain variation in DNA methylation

For each gene, we also looked for inter-strain variation in expression associated with DNA methylation.

We used the aligned DNA methylation values generated above. For each gene, we correlated DNA methylation percent with expression across strains.

There is no correlation between gene expression and DNA methylation across strains.

rrbs.by.gene.file <- here("Results", "RRBS", "RRBS.by.Gene.RDS")

if(!file.exists(rrbs.by.gene.file) || delete.previous){
  gene.by.strain <- lapply_pb(common.genes, function(y) sapply(strain_rrbs_mat, function(x) x[which(rownames(x) == y),]))
  names(gene.by.strain) <- common.genes
  saveRDS(gene.by.strain, rrbs.by.gene.file)
}else{
  gene.by.strain <- readRDS(rrbs.by.gene.file)
}


gene.expr.cor.file <- here("Results", "RRBS", "RRBS.Expr.Cor.Across.Strains.RDS")
if(!file.exists(gene.expr.cor.file) || delete.previous){
  expr.gene.names <- transcript.info[match(names(strain.expr), 
      transcript.info[,"ensembl_gene_id"]),"external_gene_name"]
  common.genes <- Reduce("intersect", lapply(strain_rrbs_mat, rownames))
  scaled.expr.mat <- t(sapply(scaled.expr[match(common.genes, expr.gene.names)], 
    function(x) if(length(x) > 1){x[expr.order,]}else{rep(NA, 9)}))
  expr.order <- order.strains(strains, names(strain.expr[[1]]), col.table)

  stat.mat <- matrix(NA, nrow = ncol(strain_rrbs_mat[[1]]), ncol = 5)
  colnames(stat.mat) <- c("r.cor", "p", "slope.fit", "slope.lwr", "slope.upr")
  rownames(stat.mat) <- colnames(strain_rrbs_mat[[1]])
  
  for(pos in 1:ncol(strain_rrbs_mat[[1]])){
    pos.rrbs <- sapply(strain_rrbs_mat, function(x) x[match(common.genes, rownames(x)),pos])
    stat.mat[pos,] <- plot.with.model(as.vector(pos.rrbs), as.vector(scaled.expr.mat), 
      report = "cor.test", confidence = 0.95, plot.results = FALSE)
  }
  saveRDS(stat.mat, gene.expr.cor.file)
}else{
  stat.mat <- readRDS(gene.expr.cor.file)
}

if(is.interactive){quartz()}
plot.new()
plot.window(xlim = c(-0.5, 1.5), ylim = c(-0.01, 0.01))
plot.poly.xy(rel.pos, stat.mat[,"slope.lwr"], rel.pos, stat.mat[,"slope.upr"],
col = "gray80", border = "gray80")
points(rel.pos, stat.mat[,"slope.fit"], type = "l", col = "gray40", lwd = 2)
axis(1);axis(2)
abline(v = c(0,1), lwd = 2, col = "gray60", lty = 2)
abline(h = 0, col = "gray40", lty = 2)
mtext("Effect Size", side = 2, line = 2.5)
mtext("Effect of DNA methylation on Gene Expression Across Strains", side = 3)

across.strain.plotting.data <- cbind(rel.pos, stat.mat[,"slope.lwr"], 
  stat.mat[,"slope.fit"], stat.mat[,"slope.upr"])
colnames(across.strain.plotting.data) <- c("position", "lwr", "fit", "upr")
saveRDS(across.strain.plotting.data, here("Results", "RRBS", "Plotting_Across_Strain.RDS"))

Gene expression by presence/absence

So far we have looked at how percent methylation relates to gene expression. Our data, and data from other suggest that the percent methylation at any given CpG site is not particularly heritable.

However, the presence or absence of a CpG site might be heritable, as it might be due to a SNP that either creates or destroys the site.

Therefore, we also want to look at the correlation between site presence/absence and gene expression across strains.

pres.expr.cor.file <- here("Results", "RRBS", "RRBS.presence.Expr.Cor.Across.Strains.RDS")

if(!file.exists(pres.expr.cor.file) || delete.previous){
  expr.gene.names <- transcript.info[match(names(strain.expr), 
      transcript.info[,"ensembl_gene_id"]),"external_gene_name"]
  common.genes <- Reduce("intersect", lapply(strain_rrbs_mat, rownames))
  scaled.expr.mat <- t(sapply(scaled.expr[match(common.genes, expr.gene.names)], 
    function(x) if(length(x) > 1){x[expr.order,]}else{rep(NA, 9)}))
  expr.order <- order.strains(strains, names(strain.expr[[1]]), col.table)

  stat.mat <- matrix(NA, nrow = ncol(strain_rrbs_mat[[1]]), ncol = 5)
  colnames(stat.mat) <- c("r.cor", "p", "slope.fit", "slope.lwr", "slope.upr")
  rownames(stat.mat) <- colnames(strain_rrbs_mat[[1]])
  
  for(pos in 1:ncol(strain_rrbs_mat[[1]])){
    pos.rrbs <- sapply(strain_rrbs_mat, function(x) x[match(common.genes, rownames(x)),pos])
    has.any <- which(apply(pos.rrbs, 1, function(x) !all(is.na(x))))
    bin.rrbs <- pos.rrbs
    bin.rrbs[which(is.na(pos.rrbs))] <- 0
    bin.rrbs[which(!is.na(pos.rrbs))] <- 1
    stat.mat[pos,] <- plot.with.model(as.vector(bin.rrbs[has.any,]), 
      as.vector(scaled.expr.mat[has.any,]), 
      report = "cor.test", confidence = 0.95, plot.results = FALSE)
  }
  saveRDS(stat.mat, pres.expr.cor.file)
}else{
  stat.mat <- readRDS(pres.expr.cor.file)
}

coords <- as.numeric(colnames(strain_rrbs_mat[[1]]))
if(is.interactive){quartz()}
plot.new()
plot.window(xlim = c(-0.5, 1.5), ylim = c(-0.1, 0.1))
plot.poly.xy(coords, stat.mat[,"slope.lwr"], coords, stat.mat[,"slope.upr"],
col = "gray80", border = "gray80")
points(coords, stat.mat[,"slope.fit"], type = "l", col = "gray40", lwd = 2)
axis(1);axis(2)
abline(v = c(0,1), lwd = 2, col = "gray60", lty = 2)
abline(h = 0, col = "gray40", lty = 2)
mtext("Effect of CpG Site Presence on Gene Expression", side = 3)
mtext("Effect Size", side = 2, line = 2.5)